import requests
import pandas as pd
import plotly.express as px
# Fetch every Genius song for one artist and plot page views against release date.
#
# Set up Genius API credentials
# NOTE(review): this token is hard-coded in source; consider loading it from an
# environment variable or an untracked config file and rotating the committed value.
access_token = "-fy2dpbY6GKJ93XPpsNRLz-2KJGwFYgOqVjxf-qet5ebM0X5cU-HOXKkNvvjAQqF"
base_url = "https://api.genius.com"
artist_name = 'J. Cole'

# Search endpoint. The query is sent through `params` so that requests
# URL-encodes the artist name (spaces, '&', '#', ...) instead of embedding it raw.
search_url = f'{base_url}/search'

# Set up headers with Genius API token
header = {
    'Authorization': f'Bearer {access_token}'
}

# Make a request to the Genius API
response = requests.get(search_url, headers=header, params={'q': artist_name})
response.raise_for_status()  # surface auth/rate-limit errors here, not as a KeyError below
data = response.json()
hits = data.get('response', {}).get('hits') or []

# Empty default so the final print works even when no artist is found
df = pd.DataFrame(columns=['Song Title', 'View Count', 'Release Date'])

# Check if artist information is available
if hits:
    # The artist is taken from the primary artist of the first search hit
    artist_id = hits[0]['result']['primary_artist']['id']

    # Initialize variables for pagination, Genius only allows 20 songs at once
    per_page = 20
    page = 1
    all_artist_songs = []

    # Fetch all songs for the artist; an empty page marks the end of the listing
    while True:
        artist_songs_url = f'{base_url}/artists/{artist_id}/songs?per_page={per_page}&page={page}'
        response = requests.get(artist_songs_url, headers=header)
        response.raise_for_status()
        songs_data = response.json()['response']['songs']
        if not songs_data:
            break  # No more songs, exit the loop
        all_artist_songs.extend(songs_data)
        page += 1

    # Remove songs without 'pageviews' field
    all_artist_songs = [
        song for song in all_artist_songs
        if 'stats' in song and 'pageviews' in song['stats']
    ]

    # Extract data for plotting
    song_titles = [song['title'] for song in all_artist_songs]
    view_counts = [song['stats']['pageviews'] for song in all_artist_songs]
    # .get() so a song with no release-date entry becomes None (handled below)
    # instead of raising KeyError
    release_dates = [song.get('release_date_components') for song in all_artist_songs]

    # Create a DataFrame from the extracted data
    df = pd.DataFrame({
        'Song Title': song_titles,
        'View Count': view_counts,
        'Release Date': release_dates
    })

    # Convert the {'year', 'month', 'day'} dicts to timestamps; missing or
    # unparseable dates become NaT because of errors='coerce'
    df['Release Date'] = pd.to_datetime(
        df['Release Date'].apply(
            lambda x: f"{x['year']}-{x['month']}-{x['day']}" if x is not None else None
        ),
        errors='coerce',
    )

    # Create an interactive scatter plot
    fig = px.scatter(
        df,
        x='Release Date',
        y='View Count',
        text='Song Title',
        title=f'View Count for {artist_name}',
    )
    # Customize plot details: the song title is passed as `text` so the hover
    # template can reference it, while mode='markers' draws points only
    fig.update_traces(
        textposition='top center',
        texttemplate='%{text}',
        hovertemplate='%{text}<br>Release Date: %{x}<br>View Count: %{y}',
        mode='markers',
    )
    fig.show()
else:
    print(f"Artist '{artist_name}' not found.")

# Display the DataFrame
print(df)
Song Title View Count Release Date 0 03' Adolescence 1359521 2014-12-09 1 1 0 0 . m i l ’ 379991 2021-05-14 2 1-888-88-DREAM 15445 2014-01-28 3 1985 1668465 2018-04-20 4 1993 324838 2019-07-05 .. ... ... ... 438 Work Out 1306066 2011-06-15 439 World is Empty 84477 2009-06-15 440 You Got It 193085 2010-11-12 441 Your Heart 293577 2021-09-24 442 Zendaya 189916 2018-02-13 [443 rows x 3 columns]
Based on the graph above, it appears that J. Cole's standout year was 2014. Despite releasing a substantial amount of music in the years between 2010 and 2013, the view counts for each were subpar compared to 2014. Notably, this was the year of one of his most iconic albums, "2014 Forest Hills Drive."
With that context, in the chunk below, I will conduct further regression analysis on all songs released in 2014. The objective is to explore whether valence, the measure of how positive a song sounds, played a role in their popularity. Given that the Genius API lacks such data, I will use the Spotify API to access valence information. This analysis aims to unveil insights into the relationship between valence and view counts, and whether valence contributed to J. Cole's most successful year.
# Build `spotify_df` (track name + valence) for the artist's albums on Spotify.
#
# Set up Spotify API credentials (50 daily limit)
# NOTE(review): the client secret is hard-coded in source; consider loading both
# values from environment variables and rotating the committed secret.
client_id = "1e6f9168e7a14e6fa864b7417bf97416"
client_secret = "736e86b1c08a40b88c967e3392ee57e9"

# Spotify API endpoint for obtaining an access token
token_url = "https://accounts.spotify.com/api/token"

# Set up the headers and payload for the token request
headers = {
    "Content-Type": "application/x-www-form-urlencoded",
}
payload = {
    "grant_type": "client_credentials",
    "client_id": client_id,
    "client_secret": client_secret,
}
response = requests.post(token_url, headers=headers, data=payload)
token_data = response.json()

# Empty default so later cells that use spotify_df do not raise NameError
# when the token request or the artist request fails
spotify_df = pd.DataFrame({'Track Name': [], 'Valence': []})

# Check if the access token was obtained successfully
if "access_token" in token_data:
    access_token = token_data["access_token"]

    # Bearer header for every Spotify Web API call below. It is assigned before
    # get_track_info is defined so the function never sees the token-request header.
    headers = {
        "Authorization": f"Bearer {access_token}",
    }

    track_cache = {}  # Memoization cache: track ID -> audio-features payload

    def get_track_info(track_id):
        """Return the audio-features payload for a track, or None on failure.

        Successful lookups are stored in ``track_cache`` so a track that
        appears on several albums is only requested once. A response without
        a ``valence`` field is treated as a failure: a message is printed and
        nothing is cached.
        """
        # Check if track information is already in the cache
        if track_id in track_cache:
            return track_cache[track_id]

        # Make API call to get track information
        track_url = f"https://api.spotify.com/v1/audio-features/{track_id}"
        track_response = requests.get(track_url, headers=headers)
        track_data = track_response.json()

        # Check if the track information was retrieved successfully
        if "valence" in track_data:
            track_cache[track_id] = track_data
            return track_data

        print(f"Unable to retrieve valence for track with ID {track_id}")
        return None

    # Get all tracks for Jcole, artist_id obtained online
    artist_id = "6l3HvQ5sa6mXTsMTB19rO5"  # J. Cole's artist ID
    artist_url = f"https://api.spotify.com/v1/artists/{artist_id}/albums"
    artist_response = requests.get(artist_url, headers=headers)
    artist_data = artist_response.json()

    # NOTE(review): only the first page returned by each endpoint is read here.
    # If the responses carry a `next` link there are more albums/tracks to
    # fetch - confirm whether the full discography is needed.

    # Check if the artist information was retrieved successfully
    if "items" in artist_data:
        all_tracks = []

        # Iterate through each album and get its tracks
        for album in artist_data["items"]:
            album_id = album["id"]
            album_url = f"https://api.spotify.com/v1/albums/{album_id}/tracks"
            album_response = requests.get(album_url, headers=headers)
            # .get() so an error payload for one album is skipped rather than
            # aborting the whole run with a KeyError
            album_tracks = album_response.json().get("items", [])
            all_tracks.extend(album_tracks)

        # Column lists that will back the DataFrame
        df_data = {'Track Name': [], 'Valence': []}

        # Keep only the tracks whose audio features could be retrieved
        for track in all_tracks:
            track_id = track["id"]
            track_info = get_track_info(track_id)
            if track_info:
                df_data['Track Name'].append(track['name'])
                df_data['Valence'].append(track_info['valence'])

        # Create a DataFrame from the collected data
        spotify_df = pd.DataFrame(df_data)

        # Print the DataFrame
        print(spotify_df)
    else:
        print("Unable to retrieve artist information.")
else:
    print("Failed to obtain access token.")
Track Name Valence 0 Stick (with JID & J. Cole feat. Kenny Mason & ... 0.597 1 Ghetto Gods Freestyle (with EARTHGANG feat. 2 ... 0.584 2 Lifestyle (with Bas feat. A$AP Ferg) 0.586 3 Starting 5 (with Lute, Cozz & Omen) 0.713 4 Coming Down (with Ari Lennox) 0.622 .. ... ... 154 90 Proof (with J. Cole) 0.528 155 LONDON (feat. J. Cole) 0.563 156 Scared Money (feat. J. Cole and Moneybagg Yo) 0.661 157 Johnny P's Caddy 0.581 158 Poke It Out (feat. J. Cole) 0.803 [159 rows x 2 columns]
In the chunk above, I created a data table containing the song name and valence from the Spotify API.
# Rebuild the Genius song table as `genius_df` so it can be merged with the
# Spotify data. Reuses `data`, `base_url`, `header` and `artist_name` from the
# first Genius cell.
genius_hits = data.get('response', {}).get('hits') or []

# Empty default so the cleaning and merge cells that follow do not raise
# NameError when the artist is not found
genius_df = pd.DataFrame(columns=['Song Title', 'View Count', 'Release Date'])

if genius_hits:
    # The artist is taken from the primary artist of the first search hit.
    # This rebinds `artist_id`, which the Spotify cell had set to a Spotify ID.
    artist_id = genius_hits[0]['result']['primary_artist']['id']

    # Initialize variables for pagination, Genius only allows 20 songs at once
    per_page = 20
    page = 1
    all_artist_songs = []

    # Request page after page until an empty page marks the end of the listing
    while True:
        genius_artist_songs_url = f'{base_url}/artists/{artist_id}/songs?per_page={per_page}&page={page}'
        genius_response = requests.get(genius_artist_songs_url, headers=header)
        genius_response.raise_for_status()  # fail on auth/rate-limit errors, not on a KeyError
        genius_songs_data = genius_response.json()['response']['songs']
        if not genius_songs_data:
            break  # No more songs, exit the loop
        all_artist_songs.extend(genius_songs_data)
        page += 1

    # Remove songs without 'pageviews' field
    all_artist_songs = [
        song for song in all_artist_songs
        if 'stats' in song and 'pageviews' in song['stats']
    ]

    # Extract data for DataFrame
    song_titles = [song['title'] for song in all_artist_songs]
    view_counts = [song['stats']['pageviews'] for song in all_artist_songs]
    # .get() so a song with no release-date entry becomes None (handled below)
    # instead of raising KeyError
    release_dates = [song.get('release_date_components') for song in all_artist_songs]

    # Create DataFrame
    genius_df = pd.DataFrame({
        'Song Title': song_titles,
        'View Count': view_counts,
        'Release Date': release_dates
    })

    # Convert the {'year', 'month', 'day'} dicts to timestamps; missing or
    # unparseable dates become NaT because of errors='coerce'
    genius_df['Release Date'] = pd.to_datetime(
        genius_df['Release Date'].apply(
            lambda x: f"{x['year']}-{x['month']}-{x['day']}" if x is not None else None
        ),
        errors='coerce',
    )

    # Print the DataFrame
    print(genius_df)
else:
    print(f"Artist '{artist_name}' not found.")
Song Title View Count Release Date 0 03' Adolescence 1359521 2014-12-09 1 1 0 0 . m i l ’ 379991 2021-05-14 2 1-888-88-DREAM 15445 2014-01-28 3 1985 1668465 2018-04-20 4 1993 324838 2019-07-05 .. ... ... ... 438 Work Out 1306066 2011-06-15 439 World is Empty 84477 2009-06-15 440 You Got It 193085 2010-11-12 441 Your Heart 293577 2021-09-24 442 Zendaya 189916 2018-02-13 [443 rows x 3 columns]
In the code chunk above, I have also extracted the song title, the view count, and the release date from the Genius API. Now that these two dataframes have a column in common, I will merge them on the song title column and preprocess the resulting data.
# Data cleaning: give both frames the same normalised 'Song Title' key so they
# can be merged on it.
spotify_df.rename(columns={'Track Name': 'Song Title'}, inplace=True)

# Lower-case each title and drop every run of non-word characters (spaces,
# punctuation, symbols). regex=True is passed explicitly: without it, newer
# pandas versions treat r'\W+' as a literal string and leave the titles
# unchanged. No separate strip() is needed because whitespace is already
# removed by the \W+ pattern.
spotify_df['Song Title'] = spotify_df['Song Title'].str.lower().str.replace(r'\W+', '', regex=True)
genius_df['Song Title'] = genius_df['Song Title'].str.lower().str.replace(r'\W+', '', regex=True)

print(spotify_df)
print(genius_df)
Song Title Valence
0 stickwithjidjcolefeatkennymasonsheckwes 0.597
1 ghettogodsfreestylewithearthgangfeat2chainz 0.584
2 lifestylewithbasfeataapferg 0.586
3 starting5withlutecozzomen 0.713
4 comingdownwitharilennox 0.622
.. ... ...
154 90proofwithjcole 0.528
155 londonfeatjcole 0.563
156 scaredmoneyfeatjcoleandmoneybaggyo 0.661
157 johnnypscaddy 0.581
158 pokeitoutfeatjcole 0.803
[159 rows x 2 columns]
Song Title View Count Release Date
0 03adolescence 1359521 2014-12-09
1 100mil 379991 2021-05-14
2 188888dream 15445 2014-01-28
3 1985 1668465 2018-04-20
4 1993 324838 2019-07-05
.. ... ... ...
438 workout 1306066 2011-06-15
439 worldisempty 84477 2009-06-15
440 yougotit 193085 2010-11-12
441 yourheart 293577 2021-09-24
442 zendaya 189916 2018-02-13
[443 rows x 3 columns]
C:\Users\rageg\AppData\Local\Temp\ipykernel_29532\1252738554.py:5: FutureWarning: The default value of regex will change from True to False in a future version. C:\Users\rageg\AppData\Local\Temp\ipykernel_29532\1252738554.py:8: FutureWarning: The default value of regex will change from True to False in a future version.
# Inner-join the Spotify and Genius rows that share a normalised song title,
# then keep only the first row for each title.
merged_df = (
    spotify_df
    .merge(genius_df, how='inner', on='Song Title')
    .drop_duplicates(subset="Song Title", keep="first")
)
print(merged_df)
Song Title Valence View Count Release Date 0 95south 0.203 790139 2021-05-14 1 amari 0.207 845154 2021-05-14 2 applyingpressure 0.404 503909 2021-05-14 3 punchintheclock 0.692 370470 2021-05-14 4 interlude 0.104 894811 2021-05-07 .. ... ... ... ... 72 godsgift 0.439 182831 2011-09-27 73 breakdown 0.489 226802 2011-09-11 74 workout 0.216 1306066 2011-06-15 75 thesecretrecipe 0.690 174949 2023-09-28 76 johnnypscaddy 0.581 350214 2022-01-28 [72 rows x 4 columns]
I have successfully merged the two datasets on their common songs. To do this, I made every value in the song title column lowercase, removed all spaces and punctuation, and merged on exact equality of the resulting titles. This ensured that the merged songs are indeed the same version. Songs that could not be matched were left out for the sake of accuracy. Below, I will plot valence vs. view count for all of J. Cole's 2014 songs to see whether how happy a song sounds affected its view count.
# Keep only the merged songs whose release year is 2014
released_in_2014 = merged_df['Release Date'].dt.year == 2014
filtered_df = merged_df[released_in_2014]
print(filtered_df)

# Scatter plot of valence against view count for the 2014 songs; the song
# title is passed as `text` so the hover template can show it
fig = px.scatter(
    filtered_df,
    x='Valence',
    y='View Count',
    text='Song Title',
    title='Valence vs View Count for Songs Released in 2014',
    labels={'Valence': 'Valence', 'View Count': 'View Count'},
)
fig.update_traces(
    textposition='top center',
    texttemplate='%{text}',
    hovertemplate='%{text}<br>Valence: %{x}<br>View Count: %{y}',
    mode='markers',
)
fig.show()

# Print the correlation coefficient between the two columns for analysis
valence_2014 = filtered_df['Valence']
views_2014 = filtered_df['View Count']
correlation_coefficient = valence_2014.corr(views_2014)
print(f"Correlation Coefficient: {correlation_coefficient}")
Song Title Valence View Count Release Date 34 january28th 0.342 1203207 2014-12-09 35 wetdreamz 0.539 3435206 2014-12-09 36 03adolescence 0.260 1359521 2014-12-09 37 ataleof2citiez 0.343 1614927 2014-12-09 38 firesquad 0.574 1790773 2014-12-09 39 sttropez 0.317 433841 2014-12-09 40 gomd 0.336 2116386 2014-12-09 41 norolemodelz 0.494 5893124 2014-12-09 42 hello 0.318 590343 2014-12-09 43 apparently 0.570 1968202 2014-12-09 44 loveyourz 0.435 2410896 2014-12-09 45 notetoself 0.333 580015 2014-12-09
Correlation Coefficient: 0.5577323811487858
At first glance, the graph above does not show an obvious relationship between view count and valence for the songs released in 2014. However, the correlation coefficient of 0.56 indicates a moderate positive correlation: songs with higher valence tend to have somewhat higher view counts. Keep in mind that this is based on only 12 songs. Let us see if this remains true in the next plot of valence vs. view count for all of the songs extracted.
# Scatter plot of valence against view count for every merged song; the song
# title is passed as `text` so the hover template can show it
fig = px.scatter(
    merged_df,
    x='Valence',
    y='View Count',
    text='Song Title',
    title='Valence vs View Count',
)
fig.update_traces(
    textposition='top center',
    texttemplate='%{text}',
    hovertemplate='%{text}<br>Valence: %{x}<br>View Count: %{y}',
    mode='markers',
)
fig.show()
Based on the plot above, there appears to be little to no correlation between how happy a song sounds (valence) and how many views it gets for the artist J. Cole. To support this, I have computed the correlation coefficient below.
# Correlation between valence and view count across all merged songs
all_valence = merged_df['Valence']
all_views = merged_df['View Count']
correlation = all_valence.corr(all_views)
print(f"Correlation between Valence and View Count: {correlation}")
Correlation between Valence and View Count: -0.03327032911909083
As previously observed, there is not much correlation between valence and view count for the songs that made it through the preprocessing.
Based on the analysis of J. Cole's songs, it appears that there is a moderate correlation between the valence (positivity) of his songs and their view count for the year 2014. However, when considering all of his songs, the correlation is less pronounced, suggesting that factors other than valence may contribute to the overall popularity of his music. Further investigation and analysis may be needed to uncover additional insights into the factors influencing the popularity of J. Cole's songs across different periods.